# library inclusions
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(tidyr)
## Loading required package: tidyr
require(ggplot2)
## Loading required package: ggplot2
require(rdrobust)
## Loading required package: rdrobust
require(AER)
## Loading required package: AER
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
require(plm)
## Loading required package: plm
##
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
require(stargazer)
## Loading required package: stargazer
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# load in cleaned data
df <- read.csv(file = "cleaned_data_csv.csv")
repeat_runners <- read.csv(file = "same_runners.csv")
# renaming variables for convenience and consistency
df <- df %>%
rename(year = Year) %>%
rename(name = Name) %>%
rename(age = Age) %>%
rename(t_hour = T_Hour) %>%
rename(t_min = T_Min) %>%
rename(t_sec = T_Sec) %>%
rename(p_min = P_Min) %>%
rename(p_sec = P_Sec) %>%
rename(pis.tis = PiS.TiS) %>%
rename(pid.tid = PiD.TiD) %>%
rename(hometown = Hometown)
repeat_runners <- repeat_runners %>%
rename(name = Name) %>%
rename(year_of_birth = Year...Age) %>%
rename(hometown = Hometown) %>%
rename(races = Races_Ran)
# converting Time/Pace into minutes and putting them in separate columns
df <- df %>%
mutate(time = t_hour * 60 + t_min + t_sec / 60,
pace = p_min + p_sec / 60)
## create gender column from name
getgender <- function(x) {
substr(x, nchar(x) - 1, nchar(x) - 1)
}
#
repeat_runners$name <- iconv(enc2utf8(repeat_runners$name), sub = "byte")
repeat_runners <- repeat_runners %>%
mutate(gender = sapply(name, getgender)) %>%
mutate(name = str_sub(name, 1, -5))
#
unique(repeat_runners$gender) # 1 case F instead of W
## [1] "M" "W" "F"
repeat_runners <- repeat_runners %>%
mutate(gender = case_when(gender == "F" ~ "W", TRUE ~ gender))
#
df$name <- iconv(enc2utf8(df$name), sub = "byte")
df <- df %>%
mutate(gender = sapply(name, getgender)) %>%
mutate(name = str_sub(name, 1, -5))
#
unique(df$gender) # 1 case F instead of W
## [1] "M" "W" "F"
df <- df %>%
mutate(gender = case_when(gender == "F" ~ "W", TRUE ~ gender))
# Plot "runtime" against "age"
ggplot(data = df,
aes(x = age,
y = time)) +
geom_point(shape = 1, size = 0.5) +
xlab("age in years") +
ylab("Run time in minutes") +
ylim(40, 180) +
ggtitle("Runtime against age, by gender") +
facet_wrap(gender ~ .)
## Warning: Removed 1196 rows containing missing values (geom_point).
# Boxplot runtime against age category, by gender
## Create age group by decade
df <- df %>%
mutate(agecat = 10 * floor(age / 10)) %>%
mutate(agecat = as.character(agecat))
ggplot(data = df %>%
mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
aes(x = agecat,
y = time)) + # by age groups in decades
geom_boxplot() + # boxplot
xlab("Age in decades") +
ylab("Run time in minutes") +
ylim(40, 150) +
<<<<<<< HEAD
ggtitle("Running performance for age groups (in decades)") +
=======
<<<<<<< HEAD
ggtitle("Running performance for age groups (in decades)") +
=======
ggtitle("Composition of age groups for each performance percentile") +
labs(fill = "age groups in decade") +
>>>>>>> 9f0c4fced4ea2a97b24453989f8ab2483201b2e0
>>>>>>> 61b1d6c783da541131b5b49307924c86bde0dbeb
facet_wrap(gender ~ .)
## Warning: Removed 1509 rows containing non-finite values (stat_boxplot).
# Linear regression
df <- df %>%
filter(!is.na(time))
model <- lm(time ~ age + gender + age:gender, data = df)
summary(model)
##
## Call:
## lm(formula = time ~ age + gender + age:gender, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.843 -10.726 -1.149 9.425 115.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.248319 0.138429 529.139 < 2e-16 ***
## age 0.308354 0.003458 89.177 < 2e-16 ***
## genderW 15.721519 0.197472 79.614 < 2e-16 ***
## age:genderW -0.031402 0.005228 -6.006 1.9e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 340331 degrees of freedom
## Multiple R-squared: 0.1861, Adjusted R-squared: 0.1861
## F-statistic: 2.595e+04 on 3 and 340331 DF, p-value: < 2.2e-16
# Residual diagnostic
plot(fitted(model), rstandard(model)); abline(0, 0) # residual pattern against fitted response
qqnorm(rstandard(model)); qqline(rstandard(model)) # normality of residual
hist(rstandard(model),
breaks = 30,
xlim = c(-6, 6),
freq = FALSE); curve(dnorm, add = TRUE)
smoothScatter(x = df$age,
y = model$residuals,
xlab = "ages",
<<<<<<< HEAD
ylab = "Residuals"); abline(h = 0)
# # DISABLED: this takes a reallyyyyy long time: temporarily disabling
=======
<<<<<<< HEAD
ylab = "Residuals"); abline(h = 0)
# # DISABLED: this takes a reallyyyyy long time: temporarily disabling
=======
ylab = "Residuals"); abline(h = 0)
# # DISABLED: this takes a reallyyyyy long time: temporarily disabling
>>>>>>> 9f0c4fced4ea2a97b24453989f8ab2483201b2e0
>>>>>>> 61b1d6c783da541131b5b49307924c86bde0dbeb
# resid.lo <- loess(resids ~ age,
# data = data.frame(resids = rstandard(model),
# age = df$age))
# res.lo.pred <- predict(resid.lo, newdata = data.frame(age = 10:100))
# smoothScatter(x = df$age,
# y = model$residuals,
# xlab = "ages",
# ylab = "Residuals")
# abline(h = 0)
# lines(x = 10:100, y = res.lo.pred, col = "red", lwd = 2, lty = 3)
# Non parametric model
df.m <- df[df$gender == "M",]
df.w <- df[df$gender == "W",]
# # DISABLED: this takes a reallyyyyy long time: temporarily disabling
# model.loess.m <- loess(time ~ age, data = df.m)
# model.loess.w <- loess(time ~ age, data = df.w)
# plot(y = predict(model.loess.m), x = df.m$age,
# xlab = "age",
# ylab = "Predicted time (minutes)",
# main = "Prediction of Performance using Nonparametric model - Men (Loess method)")
# plot(y=predict(model.loess.w), x=df.w$age,
# xlab = "age",
# ylab = "Predicted time (minutes)",
# main = "Prediction of Performance using Nonparametric model - Women (Loess method)")
# NEED SOME DESCRIPTIVE STATISTICS HERE!
# TABLES and GRAPHS show info of age, running time, missing data, number of
# observations (average, median, max, min) by YEAR
# eg. histogram of age (show the composition of age in each race, remember to
# include the number of observations in notes/legend/etc)
# eg. histogram of age by gender (for those years that number of repeat_runners are
# stable and without many missing data - LOOK OUT FOR MISSING DATA - is there a
# pattern in missing data???)
# A GRAPH IN VIZUALIZATION LAB WE DIDN"T NEED TO DRAW
# the one I sent on 11/14/2022 10:44 AM
# Maybe the GRAPH IN VIZUALIZATION LAB we did draw as our answer.
# Answer questions (with graph and/or statistics):
# ? What is the peak performing age for men? for women?
# https://pubmed.ncbi.nlm.nih.gov/31174325/
# ? What is the slow down rate after peak age of men? of women? (percent per
# decade? choose another unit of measurement if you find it"s more
# accurate/intuitive)
# temporary dataframe
temp <- df %>%
select(year, age) %>%
mutate(age = as.character(10 * floor(age / 10)))
# age_labels <- c(
# "<10",
# "10-19",
# "20-29",
# "30-39",
# "40-49",
# "50-59",
# "60-69",
# "70-79",
# "80-89",
# ">89"
# )
# stacked line plot showing age distribution for year
ggplot(temp,
aes(x = year,
group = age,
fill = age)) +
geom_bar() +
ggtitle("Age distribution of participants") +
xlab("Year") +
ylab("Number of participants")
# show proportions for each year
ggplot(temp,
aes(x = year,
group = age,
fill = age)) +
geom_bar(position = "fill") +
ggtitle("Age distribution of participants") +
xlab("Year") +
ylab("Proportion of participants (%)")
# statistics of number of participants for each year
temp <- df %>%
select(year, age) %>%
group_by(year)
summarise(temp)
## # A tibble: 47 × 1
## year
## <int>
## 1 1973
## 2 1974
## 3 1975
## 4 1976
## 5 1977
## 6 1978
## 7 1979
## 8 1980
## 9 1981
## 10 1982
## # … with 37 more rows
Model using panel data may mitigate omitted variable bias when there is no information on variables that correlate with both the regressors of interest and the independent variable and if these variables are constant in the time dimension or across entities.
# MODEL FOR TRACKED RUNNERS RECORDS
# install.packages("plm")
# install.packages("stargazer")
# install.packages("AER")
library(AER)
library(plm)
library(stargazer)
hist(repeat_runners$races)
## Create key variable in df to merge later
df <- df %>% mutate(year_of_birth=year-age)
# create ID for each runner in repeat runner list
rerun <- repeat_runners %>%
mutate(id=row_number()) %>%
select(id, name:gender)
## Filter those who ran at least 5 races
rerun <- rerun %>% filter(races>=5)
## Merge the performance info from df
rerun <- rerun %>% left_join(y = df,by = c("name","year_of_birth","gender","hometown"))
check_merge <- rerun %>% # number of races not agree with number of records (maybe due to change of hometown)
group_by(id) %>%
summarise(n_races=mean(races),
n_merge=n()) %>%
filter(n_races != n_merge)
rerun_dup <- rerun %>% # multiple people with same id (name, yob, gender, hometown) and different performance
group_by(id,year) %>%
mutate(n_dup=n()) %>%
filter(n_dup>1)
# remove merging error
rerun <- rerun %>%
group_by(id,year) %>%
mutate(n_dup=n()) %>%
filter(n_dup==1) %>%
select(-n_dup)
## Random sampling
# set.seed(2022)
# rerun[rerun$id==sample(max(rerun$id), size=2), ]
## Change id, gender, year to factor variables
rerun <- within(rerun, {
id <- factor(id)
gender <- factor(gender)
year <- factor(year)
})
head(rerun)
## # A tibble: 6 × 18
## # Groups: id, year [6]
## id name year_…¹ homet…² races gender year age t_hour t_min t_sec p_min
## <fct> <chr> <int> <chr> <int> <fct> <fct> <int> <int> <int> <int> <int>
## 1 1 Jay J… 1950 Arling… 39 M 1979 29 1 1 48 6
## 2 1 Jay J… 1950 Arling… 39 M 1980 30 0 52 41 5
## 3 1 Jay J… 1950 Arling… 39 M 1981 31 0 54 47 5
## 4 1 Jay J… 1950 Arling… 39 M 1982 32 0 56 53 5
## 5 1 Jay J… 1950 Arling… 39 M 1983 33 0 57 52 5
## 6 1 Jay J… 1950 Arling… 39 M 1984 34 0 54 51 5
## # … with 6 more variables: p_sec <int>, pis.tis <chr>, pid.tid <chr>,
## # time <dbl>, pace <dbl>, agecat <chr>, and abbreviated variable names
## # ¹​year_of_birth, ²​hometown
## Panel data graph for first 10 runners
ggplot(data = rerun %>%
filter(as.numeric(id)<11) %>%
mutate(gender = case_when(
gender=="M"~"Men",
gender=="W"~"Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age (in years)") +
ylab("Race time (in minutes)") +
ggtitle("Running time of top 10 individual runners that participated in the most races") +
theme(legend.position="none")
## Panel data graph for first 50 runners by gender
ggplot(data = rerun %>%
filter(as.numeric(id)<51) %>%
mutate(gender = case_when(
gender=="M"~"Men",
gender=="W"~"Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age (in years)") +
ylab("Race time (in minutes)") +
ggtitle("Running time of top 50 individual runners that ran the most races") +
facet_grid(.~ gender) +
theme(legend.position="none") +
stat_summary(group= F, geom="line", fun = "mean", color="black", size=1, linetype="solid")
## Panel data graph for first 50 runners by gender with smooth line
ggplot(data = rerun %>%
filter(as.numeric(id)<51) %>%
mutate(gender = case_when(
gender=="M"~"Men",
gender=="W"~"Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age (in years)") +
ylab("Race time (in minutes)") +
ggtitle("Running time of top 50 individual runners that ran the most races") +
facet_grid(.~ gender) +
theme(legend.position="none") +
# stat_summary(group= F, geom="line", fun = "mean", color="black", size=1, linetype="solid") +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
Based on the average line and Loess smooth line, two possible models are linear and quadratic model.
## OVERALL PANEL DATA MODEL
### Linear model
model.panel_l <- plm(time~age + gender+age:gender,
data = rerun %>%
mutate(gender= case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
index=c("id", "year"),
model = "within",
effect = "time")
## Warning in pdata.frame(data, index): at least one NA in at least one index dimension in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
coeftest(model.panel_l, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.433327 0.018099 23.9421 < 2.2e-16 ***
## genderWomen 18.863707 1.225592 15.3915 < 2.2e-16 ***
## age:genderWomen -0.175017 0.028709 -6.0963 1.093e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Quadratic model
model.panel_q <- plm(time~ age + age2 + gender
+ age:gender + age2:gender,
data = rerun %>%
mutate(age2 = age^2) %>%
mutate(gender= case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
index=c("id", "year"),
model = "within",
effect = "time")
## Warning in pdata.frame(data, index): at least one NA in at least one index dimension in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
coeftest(model.panel_q, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age -0.4539771 0.1169561 -3.8816 0.0001039 ***
## age2 0.0095219 0.0013044 7.2996 2.925e-13 ***
## genderWomen 10.2275025 3.8157079 2.6804 0.0073563 **
## age:genderWomen 0.1840870 0.1790245 1.0283 0.3038235
## age2:genderWomen -0.0035009 0.0020393 -1.7167 0.0860378 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Choose between the two (linear vs. quadratic)
Sum1 <- summary(model.panel_l)
RSS1 <- sum(Sum1$residuals^2)
K1 <- max(model.panel_l$assign)
N1 <- length(model.panel_l$residuals)
n1 <- N1 - K1 - model.panel_l$df.residual
AIC_model.panel_l = log(RSS1/n1) + (2*K1)/n1
Sum2 <- summary(model.panel_q)
RSS2 <- sum(Sum2$residuals^2)
K2 <- max(model.panel_q$assign)
N2 <- length(model.panel_q$residuals)
n2 <- N2 - K2 - model.panel_q$df.residual
AIC_model.panel_q = log(RSS2/n2) + (2*K2)/n2
Is linear model better than second order model?
AIC_model.panel_l<AIC_model.panel_q
## [1] TRUE
## Panel model residual diagnostics
qqnorm(model.panel_l$residuals); qqline(model.panel_l$residuals)
boxcoxResult<-MASS::boxcox(time ~ age + gender +age:gender +year, data = rerun)
lambda = boxcoxResult$x[which.max(boxcoxResult$y)];lambda
## [1] -0.06060606
### lambda close to 0 so do log-transformation of time
model.panel_l <- plm(log_time~age + gender+age:gender,
data = rerun %>%
mutate(log_time=log(time)) %>%
mutate(gender= case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
index=c("id", "year"),
model = "within",
effect = "time")
## Warning in pdata.frame(data, index): at least one NA in at least one index dimension in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
coeftest(model.panel_l, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.00511459 0.00020357 25.1251 < 2.2e-16 ***
## genderWomen 0.23557247 0.01341752 17.5571 < 2.2e-16 ***
## age:genderWomen -0.00245713 0.00030610 -8.0271 1.017e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.panel_l$coefficients
## age genderWomen age:genderWomen
## 0.005114592 0.235572473 -0.002457131
# generate predict function
pred_log <- function(data, model) {
data$pred_time = exp(predict(model)); data
}
pred_log(rerun, model = model.panel_l)
ggplot(data = pred_log(rerun, model.panel_l) %>%
filter(age>=10),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of tracked runners throughout races predicted") +
labs(title = "Predicted running time of tracked runners throughout races",
subtitle = "Log-level time-fixed effect model",
caption = "") +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group=F, aes(y=pred_time), se=F, method = "loess")
Final model (log-level model) using log-transformed racing time:
Men: One year older is associated with 0.5% in running time
Women: One year older is associated with 0.3% in running time
On average, women is slower than men by 26.6%
On average, women is slower than men by 26.6%
model.panel_l_m <- plm(time~age,
data = rerun %>%
filter(gender=="M") %>%
mutate(gender= case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
index=c("id", "year"),
model = "within",
effect = "time")
## Warning in pdata.frame(data, index): at least one NA in at least one index dimension in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
coeftest(model.panel_l_m, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.435910 0.018227 23.916 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Quadratic model
model.panel_q_m <- plm(time~ age + age2,
data = rerun %>%
filter(gender=="M") %>%
mutate(age2 = age^2) %>%
mutate(gender= case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
index=c("id", "year"),
model = "within",
effect = "time")
## Warning in pdata.frame(data, index): at least one NA in at least one index dimension in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
coeftest(model.panel_q_m, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age -0.4633102 0.1178320 -3.9320 8.442e-05 ***
## age2 0.0096500 0.0013148 7.3393 2.194e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Choose between the two (linear vs. quadratic)
Sum1 <- summary(model.panel_l_m)
RSS1 <- sum(Sum1$residuals^2)
K1 <- max(model.panel_l_m$assign)
N1 <- length(model.panel_l_m$residuals)
n1 <- N1 - K1 - model.panel_l_m$df.residual
AIC_model.panel_l_m = log(RSS1/n1) + (2*K1)/n1
Sum2 <- summary(model.panel_q_m)
RSS2 <- sum(Sum2$residuals^2)
K2 <- max(model.panel_q_m$assign)
N2 <- length(model.panel_q_m$residuals)
n2 <- N2 - K2 - model.panel_q_m$df.residual
AIC_model.panel_q_m = log(RSS2/n2) + (2*K2)/n2
Is linear model better than second order model?
AIC_model.panel_l_m <AIC_model.panel_q_m
## [1] TRUE
In this section, tracked runners are divided into 03 types of runners: Type 1 - Runners whose average performance is at most 60 minutes; Type 2 - Runners whose average performance is between 80 to 90 minutes; Type 3 - Runners who need at least 110 minutes on average to complete the race. The question investigated in this section is: Who have lower age-related decline (percent per decade) in performance? Linear model regression is used for simplicity of the interpretation.
# TWO GROUPS OF RUNNERS - ELITE RUNNERS (MORE RACES AND BETTER AVERAGE PERFORMANCE) vs the rest
## Create two types of runners
rerun <- rerun %>%
mutate(race_size = sapply(strsplit(pis.tis, "/"),
function(x) as.numeric(x[2]))) %>%
mutate(place = sapply(strsplit(pis.tis, "/"),
function(x) as.numeric(x[1])))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# print( rerun %>% filter( place > race_size ) )
rerun <- rerun %>%
filter(place <= race_size) # discard data errors
rerun <- rerun %>%
mutate(percentile = 10 * ceiling( 10 * place/race_size))
rerun <- rerun %>%
group_by(id) %>%
mutate(avg_time = mean(time),
avg_performance = mean(percentile))
rerun_type <- rerun %>%
group_by(id, name, races, year_of_birth, gender) %>%
summarise(avg_time = mean(time),
avg_performance = mean(percentile))
## `summarise()` has grouped output by 'id', 'name', 'races', 'year_of_birth'. You
## can override using the `.groups` argument.
densityPlot(rerun_type$avg_time, xlab = "Average running time"); title("Distribution of average running time");
library(scales)
bounds1 <- c(0, 60)
bounds2 <- c(80,90)
bounds3 <- c(110,160)
ggplot(rerun_type, aes(x=avg_time)) +
stat_density(geom = "line") +
stat_density(
geom = "area",
aes(x = stage(avg_time, after_stat = oob_censor(x, bounds1))),
alpha = 0.3
) +
stat_density(
geom = "area",
aes(x = stage(avg_time, after_stat = oob_censor(x, bounds2))),
alpha = 0.3
) +
stat_density(
geom = "area",
aes(x = stage(avg_time, after_stat = oob_censor(x, bounds3))),
alpha = 0.3
) +
xlab("Average running time in minutes (over multiple races)") +
ylab("Density") +
ggtitle("Distribution of average running time") +
annotate(
"text", bounds1[2]-5, y = 0.003,
label = percent(mean(!is.na(oob_censor(rerun_type$avg_time, bounds1)))))+
annotate(
"text", bounds2[2]-5, y = 0.003,
label = percent(mean(!is.na(oob_censor(rerun_type$avg_time, bounds2)))))+
annotate(
"text", bounds3[2]-25, y = 0.003,
label = percent(mean(!is.na(oob_censor(rerun_type$avg_time, bounds3))))
)
## Warning: Removed 448 rows containing missing values (position_stack).
## Warning: Removed 464 rows containing missing values (position_stack).
## Warning: Removed 301 rows containing missing values (position_stack).
## base on this plot ->
## 1. elite = complete the race no more than 60 minutes
## 2. complete the race in between 80-90 minutes
## 3. complete the race in more than 110 minutes
densityPlot(rerun_type[rerun_type$avg_time<=60,]$avg_performance)
densityPlot(rerun_type[rerun_type$avg_time<=60,]$races) # elite runners may not participate in many races
rerun_type <- rerun_type %>%
mutate(run_type=as.factor(case_when(
avg_time<=60 ~ 1,
avg_time>=80 & avg_time<=90 ~ 2,
avg_time>=110 ~ 3
)))
## Check how many people in each type of runners
rerun_type %>%
group_by(run_type, gender) %>%
summarise(num_runners=n())
<<<<<<< HEAD
=======
<<<<<<< HEAD
>>>>>>> 61b1d6c783da541131b5b49307924c86bde0dbeb
## `summarise()` has grouped output by 'run_type'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups: run_type [4]
## run_type gender num_runners
## <fct> <fct> <int>
## 1 2 M 1402
## 2 2 W 616
## 3 3 M 165
## 4 3 W 419
## 5 1 M 166
## 6 1 W 4
## 7 <NA> M 3168
## 8 <NA> W 1836
<<<<<<< HEAD
=======
=======
## # A tibble: 4 × 2
## run_type num_runners
## <fct> <int>
## 1 2 2018
## 2 3 584
## 3 1 170
## 4 <NA> 5004
>>>>>>> 9f0c4fced4ea2a97b24453989f8ab2483201b2e0
>>>>>>> 61b1d6c783da541131b5b49307924c86bde0dbeb
## Merge back the type to the performance across years dataset
rerun <- rerun %>% left_join(y=rerun_type %>%
select(-avg_time,-avg_performance),
by = c("id", "name", "races",
"year_of_birth", "gender"))
## Regress Time running on age for each group
# list.panel_bytype <- lapply(1:3,
# function(i) {
# plm(time~age+gender,
# data = rerun %>%
# filter(run_type==i),
# index=c("id", "year"),
# model = "within",
# effect = "time")
# })
# sapply(list.panel_bytype,
# function(x) print(coeftest(x, vcov. = vcovHC, type = "HC1" )))
model.panel_t1 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type==1, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
# t1_check <- lm(time ~ age + gender + year,
# data = rerun %>% filter(run_type==1, age>=10))
# summary(t1_check)
model.panel_t2 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type==2, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
model.panel_t3 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type==3, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
coeftest(model.panel_t1, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.176138 0.029025 6.0685 1.793e-09 ***
## genderW 6.026735 2.435318 2.4747 0.01349 *
## age:genderW -0.152851 0.060905 -2.5097 0.01223 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model.panel_t2, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.173264 0.011676 14.8395 < 2.2e-16 ***
## genderW 6.178681 0.880000 7.0212 2.296e-12 ***
## age:genderW -0.122804 0.021380 -5.7439 9.437e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model.panel_t3, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.255991 0.053369 4.7966 1.676e-06 ***
## genderW 5.260250 2.986493 1.7613 0.07826 .
## age:genderW -0.093824 0.061142 -1.5345 0.12498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The result from regression model suggest that:
For male runners, getting older 1 year will likely to slow them down by 0.17 ~ 0.18 minute if they are type 1 or type 3 runners, which would translate to almost 2 minutes after a decade. But for type 2 male runners, the effect of getting 1 year older is at 0.13 minutes on average.
For fmale runners, getting older 1 year will likely to slow them down by 0.18 ~ 0.2 minute if they are type 1 or type 3 runners, which would translate to almost 2 minutes after a decade. But for type 2 female runners, the effect of getting 1 year older is at 0.15 minutes on average.
The effect of age on running time may not necessarily different between genders among type 1 and type 3 runners. However, among type 2 runners, we are 95% confident that the effect of aging on women is different from that on men.
Again, type 1 are fastest group, type 3 are the slowest group. Based on this result, …
## Panel data graph for type-1,2,3 runners by gender with smooth line
ggplot(data = rerun %>%
filter(run_type==1, age>=10) %>%
mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women")),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of type-1 runners throughout races") +
facet_grid(.~ gender) +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
ggplot(data = rerun %>%
filter(run_type==2, age>=10) %>%
mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women")),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of type-2 runners throughout races") +
facet_grid(.~ gender) +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
<<<<<<< HEAD
## Warning: Computation failed in `stat_smooth()`:
## 'R_Calloc' could not allocate memory (162242632 of 8 bytes)
ggplot(data = rerun %>%
filter(run_type==3, age>=10) %>%
mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women")),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of type-3 runners throughout races") +
facet_grid(.~ gender) +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
## all-in-one graph
ggplot(data = rerun %>% filter(!is.na(run_type), age>=10)
%>% mutate(run_type=case_when(
run_type==1 ~ "Type 1",
run_type==2 ~ "Type 2",
run_type==3 ~ "Type 3"))
%>% mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of different runner type throughout races") +
facet_grid(rows = vars(gender),
cols = vars(run_type)) +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
<<<<<<< HEAD
## Warning: Computation failed in `stat_smooth()`:
## 'R_Calloc' could not allocate memory (162242632 of 8 bytes)
# QUESTIONS
# ? Who have lower age-related decline (percent per decade) in performance?
# elite repeat_runners or mortal (based on running time to decide who are elite repeat_runners
# or based on Cherry Blossom elite repeat_runners record)? men/women? -> recommendation
# to take up running as a hobby, professional training?
# https://www.frontiersin.org/articles/10.3389/fphys.2021.649282/full
# actually, I think this should be done with the tracked runner record rather
# than cross-section data - will look into it.
# https://researchoutreach.org/articles/ageing-change-sport-performance-master-athletes-answer/
Another categorization of runner type is by percentile within sex
quintile <- rerun_type %>%
group_by(gender) %>%
summarise(qt1=quantile(avg_time, probs = c(0.2)),
qt2=quantile(avg_time, probs = c(0.4)),
qt3=quantile(avg_time, probs = c(0.6)),
qt4=quantile(avg_time, probs = c(0.8)))
quintile[1,2]
## # A tibble: 1 × 1
## qt1
## <dbl>
## 1 72.0
rerun_type <- rerun_type %>%
mutate(run_type2=case_when(
gender=="M" & avg_time<=quintile[1,"qt1"] ~ 1,
gender=="M" & avg_time>=quintile[1,"qt2"] & avg_time<=quintile[1,"qt3"] ~ 2,
gender=="M" & avg_time>=quintile[1,"qt4"] ~ 3,
gender=="W" & avg_time<=quintile[2,"qt1"] ~ 1,
gender=="W" & avg_time>=quintile[2,"qt2"] & avg_time<=quintile[2,"qt3"] ~ 2,
gender=="W" & avg_time>=quintile[2,"qt4"] ~ 3)
)
## Check how many people in each type of runners
rerun_type %>%
group_by(run_type2, gender) %>%
summarise(num_runners=n())
## `summarise()` has grouped output by 'run_type2'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups: run_type2 [4]
## run_type2 gender num_runners
## <dbl> <fct> <int>
## 1 1 M 981
## 2 1 W 575
## 3 2 M 981
## 4 2 W 575
## 5 3 M 981
## 6 3 W 575
## 7 NA M 1958
## 8 NA W 1150
## Merge back the type to the performance across years dataset
rerun <- rerun %>% left_join(y=rerun_type %>%
select(-avg_time,-avg_performance),
by = c("id", "name", "races",
"year_of_birth", "gender"))
## Regress Time running on age for each group
# list.panel_bytype <- lapply(1:3,
# function(i) {
# plm(time~age+gender,
# data = rerun %>%
# filter(run_type==i),
# index=c("id", "year"),
# model = "within",
# effect = "time")
# })
# sapply(list.panel_bytype,
# function(x) print(coeftest(x, vcov. = vcovHC, type = "HC1" )))
model.panel_t1 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type2==1, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
# t1_check <- lm(time ~ age + gender + year,
# data = rerun %>% filter(run_type==1, age>=10))
# summary(t1_check)
model.panel_t2 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type2==2, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
model.panel_t3 <- plm(time~age+gender+gender:age,
data = rerun %>% filter(run_type2==3, age>=10),
index=c("id", "year"),
model = "within",
effect = "time")
coeftest(model.panel_t1, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.204030 0.017237 11.8364 < 2.2e-16 ***
## genderW 14.757362 1.225128 12.0456 < 2.2e-16 ***
## age:genderW -0.094387 0.029172 -3.2355 0.001218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model.panel_t2, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.132642 0.010971 12.0903 < 2.2e-16 ***
## genderW 15.450615 0.681703 22.6647 < 2.2e-16 ***
## age:genderW -0.064684 0.015972 -4.0499 5.159e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model.panel_t3, vcov. = vcovHC, type = "HC1")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## age 0.313803 0.026307 11.9283 < 2.2e-16 ***
## genderW 17.396202 1.659973 10.4798 < 2.2e-16 ***
## age:genderW -0.129993 0.036828 -3.5298 0.0004177 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The result from regression model suggest that:
For male runners, getting older 1 year will likely to slow them down by 0.2 minutes if they are type 1,which would translate to almost 2 minutes after a decade. 0.1 if they are type 2 runners, or 1.3 minutes after a decade. 0.3 if they are type 3 runners, or 3.1 minutes after a decade.
For female runners, getting older 1 year will likely to slow them down by 0.1 minutes if they are type 1,which would translate to almost 1.1 minutes after a decade. 0.1 if they are type 2 runners, or 0.7 minutes after a decade. 0.2 if they are type 3 runners, or 1.8 minutes after a decade.
The effect of age on running time significantly different between genders at 95% confidence level.
## all-in-one graph
ggplot(data = rerun %>% filter(!is.na(run_type2), age>=10)
%>% mutate(run_type=case_when(
run_type2==1 ~ "Type 1",
run_type2==2 ~ "Type 2",
run_type2==3 ~ "Type 3"))
%>% mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Running time of different runner type throughout races") +
facet_grid(rows = vars(gender),
cols = vars(run_type)) +
theme(legend.position="none") +
# geom_abline(slope=model.panel_t1$coefficients[1],
# intercept= summary(t1_check)$coefficients[1,1],
# color='red') +
geom_smooth(group= F, method = "loess", fill = NA)
## `geom_smooth()` using formula 'y ~ x'
Under observation that maximal oxygen uptake—a main correlate of running speed—varied in an inverse U trend across life-time (Rate and mechanism of maximal oxygen consumption decline with aging: Implications for exercise training. Sports Med. 2003, 33, 877–888.), it is possible that performance follows quadratic model, which is the basis of this section investigating runners’ peak age and peak performance. The information can be useful for athletes and trainers in designing training and practicing strategy.
# AGE OF PEAK PERFORMANCE
# Quadratic regression time~age + age2 + age:gender
## Using cross-sectional data
# model.peak <- lm(time~age+age2+age:gender,
# data=df %>% mutate(age2=age^2)); summary(model.peak)
#
# model.peak_m <- lm(time~age+age2,
# data=df %>%
# mutate(age2=age^2) %>%
# filter(gender=="M")); summary(model.peak_m)
# model.peak_w <- lm(time~age+age2,
# data=df %>%
# mutate(age2=age^2) %>%
# filter(gender=="W")); summary(model.peak_w)
# # peak age = x* = -b/2a
# pa_m <- -summary(model.peak_m)$coefficients[2,1]/(2*summary(model.peak_m)$coefficients[3,1]); pa_m
# pa_w <- -summary(model.peak_w)$coefficients[2,1]/(2*summary(model.peak_w)$coefficients[3,1]); pa_w
#
# # performance at peak age
# pperf_m <- predict(model.peak_m, data.frame(age=pa_m, age2=pa_m^2), level=0.95, interval= "confidence"); pperf_m
# pperf_w <- predict(model.peak_w, data.frame(age=pa_w, age2=pa_w^2), level=0.95, interval= "confidence"); pperf_w
## Using panel data
model.panel_pa <- plm(time~age+age2+gender,
data= rerun %>%
mutate(age2=age^2) %>%
filter(age>=10),
index= c("id","year"),
method="within",
effect = "time")
model.panel_pa <- lm(time ~ age + age2 + gender + year,
data = rerun %>%
mutate(age2=age^2) %>%
filter(age>=10))
summary(model.panel_pa)
##
## Call:
## lm(formula = time ~ age + age2 + gender + year, data = rerun %>%
## mutate(age2 = age^2) %>% filter(age >= 10))
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.517 -9.293 -0.748 8.250 87.341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.8670636 13.4374534 4.976 6.5e-07 ***
## age -0.4551140 0.0360924 -12.610 < 2e-16 ***
## age2 0.0090433 0.0003909 23.137 < 2e-16 ***
## genderW 11.2395683 0.1253231 89.685 < 2e-16 ***
## year1974 -0.9933814 14.0782935 -0.071 0.9437
## year1975 -2.3333823 14.0783852 -0.166 0.8684
## year1976 -4.5331365 14.0784830 -0.322 0.7475
## year1977 -4.5843105 14.0785849 -0.326 0.7447
## year1978 -4.5578486 14.1495335 -0.322 0.7474
## year1979 10.6458619 13.4515298 0.791 0.4287
## year1980 8.1116455 13.4470218 0.603 0.5464
## year1981 9.6681936 13.4399815 0.719 0.4719
## year1982 9.2425933 13.4372075 0.688 0.4916
## year1983 7.6459327 13.4358834 0.569 0.5693
## year1984 8.6195616 13.4349304 0.642 0.5211
## year1985 7.9789353 13.4341828 0.594 0.5526
## year1986 8.8971155 13.4334821 0.662 0.5078
## year1987 8.8058036 13.4337605 0.655 0.5121
## year1988 10.2465640 13.4329188 0.763 0.4456
## year1989 9.7977912 13.4330400 0.729 0.4658
## year1990 9.7755849 13.4339604 0.728 0.4668
## year1991 11.7045878 13.4318987 0.871 0.3835
## year1992 10.9414743 13.4335657 0.814 0.4154
## year1993 12.4012001 13.4325281 0.923 0.3559
## year1994 14.7067119 13.4316541 1.095 0.2736
## year1995 14.5913269 13.4308184 1.086 0.2773
## year1996 14.7785865 13.4302891 1.100 0.2712
## year1997 16.9482372 13.4300186 1.262 0.2070
## year1998 16.1378272 13.4297539 1.202 0.2295
## year1999 17.1261132 13.4297784 1.275 0.2022
## year2000 16.3910697 13.4298681 1.220 0.2223
## year2001 17.0752851 13.4290439 1.272 0.2035
## year2002 18.2547982 13.4288284 1.359 0.1740
## year2003 17.4907958 13.4286053 1.303 0.1928
## year2004 18.2768672 13.4285801 1.361 0.1735
## year2005 19.6024597 13.4284878 1.460 0.1444
## year2006 19.8911598 13.4277755 1.481 0.1385
## year2007 18.9166377 13.4278778 1.409 0.1589
## year2008 20.2358517 13.4276823 1.507 0.1318
## year2009 20.7048677 13.4273329 1.542 0.1231
## year2010 21.5281955 13.4273332 1.603 0.1089
## year2011 21.8219556 13.4271863 1.625 0.1041
## year2012 21.2332633 13.4270697 1.581 0.1138
## year2013 22.4509260 13.4269588 1.672 0.0945 .
## year2014 22.8888970 13.4271996 1.705 0.0883 .
## year2015 18.2209050 13.4273990 1.357 0.1748
## year2016 24.3630068 13.4279092 1.814 0.0696 .
## year2017 23.6992434 13.4280573 1.765 0.0776 .
## year2018 24.8566220 13.4283783 1.851 0.0642 .
## year2019 24.0832667 13.4289767 1.793 0.0729 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.42 on 54970 degrees of freedom
## Multiple R-squared: 0.305, Adjusted R-squared: 0.3044
## F-statistic: 492.4 on 49 and 54970 DF, p-value: < 2.2e-16
model.panel_pa_m <- lm(time ~ age + age2 + year,
data = rerun %>%
mutate(age2=age^2) %>%
filter(age>=10, gender=="M"))
model.panel_pa_w <- lm(time ~ age + age2 + year,
data = rerun %>%
mutate(age2=age^2) %>%
filter(age>=10, gender=="W"))
# peak age = x* = -b/2a
pa_m <- -summary(model.panel_pa_m)$coefficients[2,1]/(2*summary(model.panel_pa_m)$coefficients[3,1]); pa_m
## [1] 23.48294
pa_w <- -summary(model.panel_pa_w)$coefficients[2,1]/(2*summary(model.panel_pa_w)$coefficients[3,1]); pa_w
## [1] 22.11456
# performance at peak age
pperf_m <- predict(model.panel_pa_m, data.frame(age=pa_m, age2=pa_m^2, year=as.factor(2019)), level=0.95, interval= "confidence"); pperf_m
## fit lwr upr
## 1 83.63635 82.60917 84.66353
pperf_w <- predict(model.panel_pa_w, data.frame(age=pa_w, age2=pa_w^2, year=as.factor(2019)), level=0.95, interval= "confidence"); pperf_w
## fit lwr upr
## 1 98.03589 96.81228 99.25951
From the record of tracked runners, filtered people at least 10 years old, using time-fixed effect model, the peak age of performance for men (for the year 2019) is 23.5 (95% confidence to be between 82.6, 84.7 minutes) and women is 22.1 (95% confidence to be between 96.8, 99.3 minutes).
# Plot
model.panel_pa_m <- lm(time ~ age + age2,
data = rerun %>%
mutate(age2=age^2) %>%
filter(age>=10, gender=="M"))
model.panel_pa_w <- lm(time ~ age + age2,
data = rerun %>%
mutate(age2=age^2) %>%
filter(age>=10, gender=="W"))
summary(model.panel_pa_m)
##
## Call:
## lm(formula = time ~ age + age2, data = rerun %>% mutate(age2 = age^2) %>%
## filter(age >= 10, gender == "M"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.012 -9.759 -1.477 8.014 86.506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.1704518 1.0790555 79.86 <2e-16 ***
## age -0.6564144 0.0471098 -13.93 <2e-16 ***
## age2 0.0124058 0.0005001 24.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.96 on 35629 degrees of freedom
## Multiple R-squared: 0.1365, Adjusted R-squared: 0.1365
## F-statistic: 2817 on 2 and 35629 DF, p-value: < 2.2e-16
summary(model.panel_pa_w)
##
## Call:
## lm(formula = time ~ age + age2, data = rerun %>% mutate(age2 = age^2) %>%
## filter(age >= 10, gender == "W"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.383 -10.055 -0.662 9.237 75.986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.1197698 1.4101257 69.582 < 2e-16 ***
## age -0.4178073 0.0658674 -6.343 2.3e-10 ***
## age2 0.0080455 0.0007424 10.837 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.48 on 19385 degrees of freedom
## Multiple R-squared: 0.04702, Adjusted R-squared: 0.04692
## F-statistic: 478.3 on 2 and 19385 DF, p-value: < 2.2e-16
# peak age = x* = -b/2a
pa_m <- -summary(model.panel_pa_m)$coefficients[2,1]/(2*summary(model.panel_pa_m)$coefficients[3,1]); pa_m
## [1] 26.45592
pa_w <- -summary(model.panel_pa_w)$coefficients[2,1]/(2*summary(model.panel_pa_w)$coefficients[3,1]); pa_w
## [1] 25.96515
# performance at peak age
pperf_m <- predict(model.panel_pa_m, data.frame(age=pa_m, age2=pa_m^2), level=0.95, interval= "confidence"); pperf_m
## fit lwr upr
## 1 77.48743 77.07538 77.89947
pperf_w <- predict(model.panel_pa_w, data.frame(age=pa_w, age2=pa_w^2), level=0.95, interval= "confidence"); pperf_w
## fit lwr upr
## 1 92.69555 92.21839 93.17272
# Plot
ggplot(data = rerun %>% filter(age>=10, gender=="M")
%>% mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Peak age and performance of men") +
# facet_grid(rows = vars(gender)) +
theme(legend.position="none") +
geom_smooth(group= F, method = "lm",
formula = y ~ poly(x,2),
level = 1,
se = T,
fill = NA) +
geom_point(aes(x=pa_m[1], y=pperf_m[1]), color="black", size=3) +
annotate("text", x=pa_m[1]+5, y=pperf_m[1]+5,
label=paste0("(",
round(pa_m[1], digits = 1),
", ",
round(pperf_m[1], digits = 1),
")"),
size=4)
ggplot(data = rerun %>% filter(age>=10, gender=="W")
%>% mutate(gender=case_when(
gender=="M" ~ "Men",
gender=="W" ~ "Women"
)),
aes(x = age, y = time, group = id)) +
geom_line(aes(color=id)) + # line graph
xlab("Age") +
ylab("Race time") +
ggtitle("Peak age and performance of women") +
# facet_grid(rows = vars(gender)) +
theme(legend.position="none") +
geom_smooth(group= F, method = "lm",
formula = y ~ poly(x,2),
level = 1,
se = T,
fill = NA) +
geom_point(aes(x=pa_w[1], y=pperf_w[1]), color="black", size=3) +
annotate("text", x=pa_w[1]+5, y=pperf_w[1]+5,
label=paste0("(",
round(pa_w[1], digits = 1),
", ",
round(pperf_w[1], digits = 1),
")"),
size=4)
From the record of tracked runners, filtered people at least 10 years old, using quadratic regression model, the peak age of performance for men is 26.5 (95% confidence to be between 77.1, 77.9 minutes) and women is 26 (95% confidence to be between 92.2, 93.2 minutes).
random_subset <- df %>% sample_n(200) # oof idk man
# # DISABLED: this takes a reallyyyyy long time: temporarily disabling
# resid.lo <- loess(resids ~ age,
# data = data.frame(resids = rstandard(model),
# age = df$age))
# res.lo.pred <- predict(resid.lo, newdata = data.frame(age = 10:100))
# smoothScatter(x = df$age,
# y = model$residuals,
# xlab = "ages",
# ylab = "Residuals")
# abline(h = 0)
# lines(x = 10:100,
# y = res.lo.pred,
# col = "red",
# lwd = 2,
# lty = 3)
rdplot(narrow$yvar, narrow$school_enrollment, c = 40.5, p = 1, nbins = 20)
#Collapse data
by_school <- group_by(narrow, schlcode)
schools <- summarise(by_school, school_enrollment =
mean(school_enrollment, na.rm = TRUE))
#Draw graph
ggplot(schools, aes(school_enrollment)) +
geom_histogram(bins = 40) +
geom_vline(xintercept = 40.5, color = "red")
#Save graph
ggsave("school_counts.png")
# Use PiS/Tis to create place in percentile
## Extract position and total number of repeat_runners in each race
df <- df %>%
mutate(race_size = sapply(strsplit(place, "/"),
function(x) as.numeric(x[2]))) %>%
mutate(place = sapply(strsplit(place, "/"),
function(x) as.numeric(x[1])))